<!DOCTYPE HTML>
<html lang="zh-CN">


<head>
    <meta charset="utf-8">
    <meta name="keywords" content="爬虫,crawler,robots">
    <meta name="description" content="简单童真">
    <meta http-equiv="X-UA-Compatible" content="IE=edge">
    <meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=no">
    <meta name="renderer" content="webkit|ie-stand|ie-comp">
    <meta name="mobile-web-app-capable" content="yes">
    <meta name="format-detection" content="telephone=no">
    <meta name="apple-mobile-web-app-capable" content="yes">
    <meta name="apple-mobile-web-app-status-bar-style" content="black-translucent">
    <!-- Global site tag (gtag.js) - Google Analytics -->


    <title>爬虫的&#34;盗亦有道&#34;-Robots协议 | JacksonYoudi</title>
    <link rel="icon" type="image/png" href="/medias/mj-moon.png">

    <link rel="stylesheet" type="text/css" href="/libs/awesome/css/all.css">
    <link rel="stylesheet" type="text/css" href="/libs/materialize/materialize.min.css">
    <link rel="stylesheet" type="text/css" href="/libs/aos/aos.css">
    <link rel="stylesheet" type="text/css" href="/libs/animate/animate.min.css">
    <link rel="stylesheet" type="text/css" href="/libs/lightGallery/css/lightgallery.min.css">
    <link rel="stylesheet" type="text/css" href="/css/matery.css">
    <link rel="stylesheet" type="text/css" href="/css/my.css">

    <script src="/libs/jquery/jquery.min.js"></script>

<meta name="generator" content="Hexo 5.4.0">
<style>.github-emoji { position: relative; display: inline-block; width: 1.2em; min-height: 1.2em; overflow: hidden; vertical-align: top; color: transparent; }  .github-emoji > span { position: relative; z-index: 10; }  .github-emoji img, .github-emoji .fancybox { margin: 0 !important; padding: 0 !important; border: none !important; outline: none !important; text-decoration: none !important; user-select: none !important; cursor: auto !important; }  .github-emoji img { height: 1.2em !important; width: 1.2em !important; position: absolute !important; left: 50% !important; top: 50% !important; transform: translate(-50%, -50%) !important; user-select: none !important; cursor: auto !important; } .github-emoji-fallback { color: inherit; } .github-emoji-fallback img { opacity: 0 !important; }</style>
</head>




<body>
    <header class="navbar-fixed">
    <nav id="headNav" class="bg-color nav-transparent">
        <div id="navContainer" class="nav-wrapper container">
            <div class="brand-logo">
                <a href="/" class="waves-effect waves-light">
                    
                    <img src="/medias/mj-moon.png" class="logo-img" alt="LOGO">
                    
                    <span class="logo-span">JacksonYoudi</span>
                </a>
            </div>
            

<a href="#" data-target="mobile-nav" class="sidenav-trigger button-collapse"><i class="fas fa-bars"></i></a>
<ul class="right nav-menu">
  
  <li class="hide-on-med-and-down nav-item">
    
    <a href="/" class="waves-effect waves-light">
      
      <i class="fas fa-home" style="zoom: 0.6;"></i>
      
      <span>首页</span>
    </a>
    
  </li>
  
  <li class="hide-on-med-and-down nav-item">
    
    <a href="/categories" class="waves-effect waves-light">
      
      <i class="fas fa-bookmark" style="zoom: 0.6;"></i>
      
      <span>分类</span>
    </a>
    
  </li>
  
  <li class="hide-on-med-and-down nav-item">
    
    <a href="/archives" class="waves-effect waves-light">
      
      <i class="fas fa-archive" style="zoom: 0.6;"></i>
      
      <span>归档</span>
    </a>
    
  </li>
  
  <li class="hide-on-med-and-down nav-item">
    
    <a href="/tags" class="waves-effect waves-light">
      
      <i class="fas fa-tags" style="zoom: 0.6;"></i>
      
      <span>标签</span>
    </a>
    
  </li>
  
  <li class="hide-on-med-and-down nav-item">
    
    <a href="/about" class="waves-effect waves-light">
      
      <i class="fas fa-user-circle" style="zoom: 0.6;"></i>
      
      <span>关于</span>
    </a>
    
  </li>
  
  <li>
    <a href="#searchModal" class="modal-trigger waves-effect waves-light">
      <i id="searchIcon" class="fas fa-search" title="搜索" style="zoom: 0.85;"></i>
    </a>
  </li>
</ul>


<div id="mobile-nav" class="side-nav sidenav">

    <div class="mobile-head bg-color">
        
        <img src="/medias/mj-moon.png" class="logo-img circle responsive-img">
        
        <div class="logo-name">JacksonYoudi</div>
        <div class="logo-desc">
            
            简单童真
            
        </div>
    </div>

    

    <ul class="menu-list mobile-menu-list">
        
        <li class="m-nav-item">
	  
		<a href="/" class="waves-effect waves-light">
			
			    <i class="fa-fw fas fa-home"></i>
			
			首页
		</a>
          
        </li>
        
        <li class="m-nav-item">
	  
		<a href="/categories" class="waves-effect waves-light">
			
			    <i class="fa-fw fas fa-bookmark"></i>
			
			分类
		</a>
          
        </li>
        
        <li class="m-nav-item">
	  
		<a href="/archives" class="waves-effect waves-light">
			
			    <i class="fa-fw fas fa-archive"></i>
			
			归档
		</a>
          
        </li>
        
        <li class="m-nav-item">
	  
		<a href="/tags" class="waves-effect waves-light">
			
			    <i class="fa-fw fas fa-tags"></i>
			
			标签
		</a>
          
        </li>
        
        <li class="m-nav-item">
	  
		<a href="/about" class="waves-effect waves-light">
			
			    <i class="fa-fw fas fa-user-circle"></i>
			
			关于
		</a>
          
        </li>
        
        
        <li><div class="divider"></div></li>
        <li>
            <a href="https://github.com/jacksonyoudi" class="waves-effect waves-light" target="_blank">
                <i class="fab fa-github-square fa-fw"></i>Fork Me
            </a>
        </li>
        
    </ul>
</div>


        </div>

        
            <style>
    .nav-transparent .github-corner {
        display: none !important;
    }

    .github-corner {
        position: absolute;
        z-index: 10;
        top: 0;
        right: 0;
        border: 0;
        transform: scale(1.1);
    }

    .github-corner svg {
        color: #0f9d58;
        fill: #fff;
        height: 64px;
        width: 64px;
    }

    .github-corner:hover .octo-arm {
        animation: a 0.56s ease-in-out;
    }

    .github-corner .octo-arm {
        animation: none;
    }

    @keyframes a {
        0%,
        to {
            transform: rotate(0);
        }
        20%,
        60% {
            transform: rotate(-25deg);
        }
        40%,
        80% {
            transform: rotate(10deg);
        }
    }
</style>

<a href="https://github.com/jacksonyoudi" class="github-corner tooltipped hide-on-med-and-down" target="_blank"
   data-tooltip="Fork Me" data-position="left" data-delay="50">
    <svg viewBox="0 0 250 250" aria-hidden="true">
        <path d="M0,0 L115,115 L130,115 L142,142 L250,250 L250,0 Z"></path>
        <path d="M128.3,109.0 C113.8,99.7 119.0,89.6 119.0,89.6 C122.0,82.7 120.5,78.6 120.5,78.6 C119.2,72.0 123.4,76.3 123.4,76.3 C127.3,80.9 125.5,87.3 125.5,87.3 C122.9,97.6 130.6,101.9 134.4,103.2"
              fill="currentColor" style="transform-origin: 130px 106px;" class="octo-arm"></path>
        <path d="M115.0,115.0 C114.9,115.1 118.7,116.5 119.8,115.4 L133.7,101.6 C136.9,99.2 139.9,98.4 142.2,98.6 C133.8,88.0 127.5,74.4 143.8,58.0 C148.5,53.4 154.0,51.2 159.7,51.0 C160.3,49.4 163.2,43.6 171.4,40.1 C171.4,40.1 176.1,42.5 178.8,56.2 C183.1,58.6 187.2,61.8 190.9,65.4 C194.5,69.0 197.7,73.2 200.1,77.6 C213.8,80.2 216.3,84.9 216.3,84.9 C212.7,93.1 206.9,96.0 205.4,96.6 C205.1,102.4 203.0,107.8 198.3,112.5 C181.9,128.9 168.3,122.5 157.7,114.1 C157.9,116.9 156.7,120.9 152.7,124.9 L141.0,136.5 C139.8,137.7 141.6,141.9 141.8,141.8 Z"
              fill="currentColor" class="octo-body"></path>
    </svg>
</a>
        
    </nav>

</header>

    



<div class="bg-cover pd-header post-cover" style="background-image: url('https://raw.githubusercontent.com/jacksonyoudi/images/main/uPic/k6GFvZ.jpg')">
    <div class="container" style="right: 0px;left: 0px;">
        <div class="row">
            <div class="col s12 m12 l12">
                <div class="brand">
                    <h1 class="description center-align post-title">爬虫的&#34;盗亦有道&#34;-Robots协议</h1>
                </div>
            </div>
        </div>
    </div>
</div>




<main class="post-container content">

    
    <link rel="stylesheet" href="/libs/tocbot/tocbot.css">
<style>
    #articleContent h1::before,
    #articleContent h2::before,
    #articleContent h3::before,
    #articleContent h4::before,
    #articleContent h5::before,
    #articleContent h6::before {
        display: block;
        content: " ";
        height: 100px;
        margin-top: -100px;
        visibility: hidden;
    }

    #articleContent :focus {
        outline: none;
    }

    .toc-fixed {
        position: fixed;
        top: 64px;
    }

    .toc-widget {
        width: 345px;
        padding-left: 20px;
    }

    .toc-widget .toc-title {
        padding: 35px 0 15px 17px;
        font-size: 1.5rem;
        font-weight: bold;
        line-height: 1.5rem;
    }

    .toc-widget ol {
        padding: 0;
        list-style: none;
    }

    #toc-content {
        padding-bottom: 30px;
        overflow: auto;
    }

    #toc-content ol {
        padding-left: 10px;
    }

    #toc-content ol li {
        padding-left: 10px;
    }

    #toc-content .toc-link:hover {
        color: #42b983;
        font-weight: 700;
        text-decoration: underline;
    }

    #toc-content .toc-link::before {
        background-color: transparent;
        max-height: 25px;

        position: absolute;
        right: 23.5vw;
        display: block;
    }

    #toc-content .is-active-link {
        color: #42b983;
    }

    #floating-toc-btn {
        position: fixed;
        right: 15px;
        bottom: 76px;
        padding-top: 15px;
        margin-bottom: 0;
        z-index: 998;
    }

    #floating-toc-btn .btn-floating {
        width: 48px;
        height: 48px;
    }

    #floating-toc-btn .btn-floating i {
        line-height: 48px;
        font-size: 1.4rem;
    }
</style>
<div class="row">
    <div id="main-content" class="col s12 m12 l9">
        <!-- 文章内容详情 -->
<div id="artDetail">
    <div class="card">
        <div class="card-content article-info">
            <div class="row tag-cate">
                <div class="col s7">
                    
                    <div class="article-tag">
                        
                            <a href="/tags/crawler/">
                                <span class="chip bg-color">crawler</span>
                            </a>
                        
                            <a href="/tags/%E7%88%AC%E8%99%AB/">
                                <span class="chip bg-color">爬虫</span>
                            </a>
                        
                            <a href="/tags/robots/">
                                <span class="chip bg-color">robots</span>
                            </a>
                        
                    </div>
                    
                </div>
                <div class="col s5 right-align">
                    
                    <div class="post-cate">
                        <i class="fas fa-bookmark fa-fw icon-category"></i>
                        
                            <a href="/categories/%E6%8A%80%E6%9C%AF/" class="post-category">
                                技术
                            </a>
                        
                    </div>
                    
                </div>
            </div>

            <div class="post-info">
                
                <div class="post-date info-break-policy">
                    <i class="far fa-calendar-minus fa-fw"></i>发布日期:&nbsp;&nbsp;
                    2021-07-31
                </div>
                

                

                

                

                
            </div>
        </div>
        <hr class="clearfix">

        
        <!-- 是否加载使用自带的 prismjs. -->
        <link rel="stylesheet" href="/libs/prism/prism.css">
        

        

        <div class="card-content article-card-content">
            <div id="articleContent">
                <h2 id="网络爬虫的君子协议"><a href="#网络爬虫的君子协议" class="headerlink" title="网络爬虫的君子协议"></a>网络爬虫的君子协议</h2><p><img src="https://raw.githubusercontent.com/jacksonyoudi/images/main/uPic/3tqLVS.jpg"></p>
<h3 id="网络爬虫的尺寸"><a href="#网络爬虫的尺寸" class="headerlink" title="网络爬虫的尺寸"></a>网络爬虫的尺寸</h3><table>
<thead>
<tr>
<th>小规模，数量小，爬去速度不敏感，requests库</th>
<th>中规模，数据规模较大，爬取速度敏感scrapy库</th>
<th>大规模，搜索引擎,爬取速度关键定制开发</th>
</tr>
</thead>
<tbody><tr>
<td>爬取网页 玩转网页</td>
<td>爬取网站 爬取系列网站</td>
<td>爬取全网</td>
</tr>
</tbody></table>
<h3 id="网络爬虫引发的问题"><a href="#网络爬虫引发的问题" class="headerlink" title="网络爬虫引发的问题"></a>网络爬虫引发的问题</h3><ul>
<li>性能骚扰</li>
<li>法律风险</li>
<li>隐私泄露</li>
</ul>
<p><em>网络爬虫的”性能骚扰”</em><br>web服务器默认接受人类访问，受限于编写水平和目的，网络爬虫将会为web服务器带来巨大的资源的开销。<br><em>网络爬虫的法律风险</em><br>服务器上的数据有产权归属，网络爬虫获取数据后牟利将会带来法律的风险。<br><em>网络爬虫的隐私泄露</em><br>网络爬虫可能具备突破简单访问的控制能力，获取被保护的数据，从而泄露个人隐私。</p>
<h3 id="网络爬虫的限制"><a href="#网络爬虫的限制" class="headerlink" title="网络爬虫的限制"></a>网络爬虫的限制</h3><ul>
<li>来源审查：判断<code>User-Agent</code>进行限制，检查来访者HTTP协议头的User-Agent域，只响应浏览器或友好爬虫的访问</li>
<li>发布公告： <code>Robots</code>协议， 告知所有的爬虫网站的爬虫策略，要求爬虫遵守。</li>
</ul>
<h2 id="Robots协议"><a href="#Robots协议" class="headerlink" title="Robots协议"></a>Robots协议</h2><blockquote>
<p>Robots协议（也称为爬虫协议、机器人协议等）的全称是“网络爬虫排除标准”（Robots ExclusionProtocol），网站通过Robots协议告诉搜索引擎哪些页面可以抓取，哪些页面不能抓取.</p>
</blockquote>
<p>根据协议，网站管理员可以在网站域名的根目录下放一个robots.txt 文本文件，里面可以指定不同的网络爬虫能访问的页面和禁止访问的页面，指定的页面由正则表达式表示。网络爬虫在采集这个网站之前，首先获取到这个文件，然后解析到其中的规则，然后根据规则来采集网站的数据。</p>
<p>注意，这个协议的存在更多的是需要网络爬虫去遵守，而起不到防止爬虫的功能。</p>
<h3 id="为什么需要Robots协议"><a href="#为什么需要Robots协议" class="headerlink" title="为什么需要Robots协议"></a>为什么需要Robots协议</h3><p>互联网上的网页是通过超级链接互相关联起来的，从而形成了网页的网状结构。爬虫的工作方式就像蜘蛛在网上沿着链接爬来爬去，最基本的流程可以简化如下：</p>
<ol>
<li>喂给爬虫一堆url，我们称之为种子(seeds)；</li>
<li>爬虫抓取seeds，解析html网页，抽取其中的超级链接；</li>
<li>爬虫接着抓取这些新发现的链接指向的网页。</li>
</ol>
<p>步骤2和步骤3循环往复。</p>
<p>了解了上面的流程就能发现：对爬虫来说网站非常被动，只有老老实实被抓取的份。</p>
<p>所以，对于网站的管理者来说，就存在这样的需求：</p>
<p>某些路径下是个人隐私或者网站管理使用，不想被搜索引擎抓取，比如说日本爱情动作片；<br>不喜欢某个搜索引擎，不愿意被他抓取，最有名的就是之前淘宝不希望被百度抓取；<br>小网站使用的是公用的虚拟主机，流量有限或者需要付费，希望搜索引擎抓的温柔点；<br>某些网页是动态生成的，没有直接的链接指向，但是希望内容被搜索引擎抓取和索引。</p>
<p>网站内容的所有者是网站管理员，搜索引擎应该尊重所有者的意愿，为了满足以上等等，就需要提供一种网站和爬虫进行沟通的途径，给网站管理员表达自己意愿的机会。有需求就有供应，robots协议就此诞生。</p>
<h4 id="案例"><a href="#案例" class="headerlink" title="案例"></a>案例</h4><p>京东的Robots协议<br><code>https://www.jd.com/robots.txt</code></p>
<pre class="line-numbers language-javascript" data-language="javascript"><code class="language-javascript">User<span class="token operator">-</span>agent<span class="token operator">:</span> <span class="token operator">*</span> 
Disallow<span class="token operator">:</span> <span class="token operator">/</span><span class="token operator">?</span><span class="token operator">*</span> 
Disallow<span class="token operator">:</span> <span class="token operator">/</span>pop<span class="token comment">/*.html 
Disallow: /pinpai/*.html?* 
User-agent: EtaoSpider 
Disallow: / 
User-agent: HuihuiSpider 
Disallow: / 
User-agent: GwdangSpider 
Disallow: / 
User-agent: WochachaSpider 
Disallow: /</span><span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span></span></code></pre>
<p>百度的Robots协议<br><code>https://www.baidu.com/robots.txt</code></p>
<pre class="line-numbers language-bash" data-language="bash"><code class="language-bash">User-agent: Baiduspider
Disallow: /baidu
Disallow: /s?
Disallow: /ulink?
Disallow: /link?

User-agent: Googlebot
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?

User-agent: MSNBot
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?

User-agent: Baiduspider-image
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?

User-agent: YoudaoBot
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?

User-agent: Sogou web spider
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?

User-agent: Sogou inst spider
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?

User-agent: Sogou spider2
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?

User-agent: Sogou blog
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?

User-agent: Sogou News Spider
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?

User-agent: Sogou Orion spider
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?

User-agent: ChinasoSpider
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?

User-agent: Sosospider
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?


User-agent: yisouspider
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?

User-agent: EasouSpider
Disallow: /baidu
Disallow: /s?
Disallow: /shifen/
Disallow: /homepage/
Disallow: /cpro
Disallow: /ulink?
Disallow: /link?

User-agent: *
Disallow: /<span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span><span></span></span></code></pre>

<p>上面，<code>*</code>代表所有，<code>/</code>代表根目录</p>
<h3 id="Robots协议的写法"><a href="#Robots协议的写法" class="headerlink" title="Robots协议的写法"></a>Robots协议的写法</h3><p>既然网络爬虫在爬取一个网站之前，要先获取到这个文件，然后解析到其中的规则，那么，Robots就必须要有一套通用的语法规则。</p>
<p>最简单的robots.txt只有两条规则：</p>
<p>User-agent：指定对哪些爬虫生效<br>Disallow：指定要屏蔽的网址<br>先说User-agent，爬虫抓取时会声明自己的身份，这就是User-agent，没错，就是http协议里的User-agent。robots.txt利用User-agent来区分各个引擎的爬虫，比如说google网页搜索爬虫的User-agent为Googlebot。</p>
<p>可能有读者要问了，我怎么知道爬虫的User-agent是什么?你还可以查相关搜索引擎的资料得到官方的数据，比如说百度的爬虫列表是这样的：</p>
<table>
<thead>
<tr>
<th>产品名称</th>
<th>对应User-Agent</th>
</tr>
</thead>
<tbody><tr>
<td>网页搜索</td>
<td>Baiduspider</td>
</tr>
<tr>
<td>移动搜索</td>
<td>Baiduspider</td>
</tr>
<tr>
<td>图片搜索</td>
<td>Baiduspider-image</td>
</tr>
<tr>
<td>视频搜索</td>
<td>Baiduspider-video</td>
</tr>
<tr>
<td>新闻搜索</td>
<td>Baiduspider-news</td>
</tr>
<tr>
<td>百度搜索</td>
<td>Baiduspider-favo</td>
</tr>
<tr>
<td>百度联盟</td>
<td>Baiduspider-cpro</td>
</tr>
<tr>
<td>商务搜索</td>
<td>Baiduspider-ads</td>
</tr>
</tbody></table>
<p>Disallow 行列出的是要拦截的网页，以正斜线 (/) 开头，可以列出特定的网址或模式。要屏蔽整个网站，使用正斜线即可;要屏蔽某一目录以及其中的所有内容，在目录名后添加正斜线;要屏蔽某个具体的网页，就指出这个网页。</p>
<p>下面介绍一些实例:<br>允许所有的robot访问</p>
<pre class="line-numbers language-bash" data-language="bash"><code class="language-bash">User-agent: *
Disallow:<span aria-hidden="true" class="line-numbers-rows"><span></span><span></span></span></code></pre>

<p>或者也可以建一个空文件 “/robots.txt” file。</p>
<p>禁止爬虫访问所有目录</p>
<pre class="line-numbers language-bash" data-language="bash"><code class="language-bash">User-agent: *
Disallow: /<span aria-hidden="true" class="line-numbers-rows"><span></span><span></span></span></code></pre>
<p>禁止爬虫访问某些目录</p>
<pre class="line-numbers language-bash" data-language="bash"><code class="language-bash">User-agent: *
Disallow: /a/
Disallow: /b/
Disallow: /c/<span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span><span></span></span></code></pre>
<p>禁止某些爬虫访问</p>
<pre class="line-numbers language-bash" data-language="bash"><code class="language-bash">User-agent: BadBot
Disallow: /<span aria-hidden="true" class="line-numbers-rows"><span></span><span></span></span></code></pre>
<p>只允许某个爬虫访问</p>
<pre class="line-numbers language-bash" data-language="bash"><code class="language-bash">User-agent: MangCrawler
Disallow:
User-agent: *
Disallow: /<span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span><span></span></span></code></pre>
<p>我们再来结合两个真实的范例来学习一下。先看这个例子：</p>
<pre class="line-numbers language-bash" data-language="bash"><code class="language-bash">User-agent: Baiduspider
Disallow: /
User-agent: baiduspider
Disallow: /<span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span><span></span></span></code></pre>
<p>这个是淘宝网的Robots协议内容，相信你已经看出来了，淘宝网禁止百度的爬虫访问。</p>
<p>再来看一个例子：</p>
<pre class="line-numbers language-bash" data-language="bash"><code class="language-bash">User-agent: *
Disallow: /?*
Disallow: /pop/*.html
User-agent: EtaoSpider
Disallow: /<span aria-hidden="true" class="line-numbers-rows"><span></span><span></span><span></span><span></span><span></span></span></code></pre>
<p>这个稍微复杂点，京东有2个目录不希望所有的爬虫来抓。同时，京东完全屏蔽了一淘网的蜘蛛（EtaoSpider是一淘网的蜘蛛）。</p>
<h2 id="Robots协议进阶知识"><a href="#Robots协议进阶知识" class="headerlink" title="Robots协议进阶知识"></a>Robots协议进阶知识</h2><p><strong>sitemap</strong><br>爬虫会通过网页内部的链接发现新的网页。但是如果没有连接指向的网页怎么办?或者用户输入条件生成的动态网页怎么办?能否让网站管理员通知搜索引擎他们网站上有哪些可供抓取的网页?这就是sitemap，最简单的 Sitepmap 形式就是 XML 文件，在其中列出网站中的网址以及关于每个网址的其他数据(上次更新的时间、更改的频率以及相对于网站上其他网址的重要程度等等)，利用这些信息搜索引擎可以更加智能地抓取网站内容。</p>
<p>新的问题来了，爬虫怎么知道这个网站有没有提供sitemap文件，或者说网站管理员生成了sitemap，(可能是多个文件)，爬虫怎么知道放在哪里呢?</p>
<p>由于robots.txt的位置是固定的，于是大家就想到了把sitemap的位置信息放在robots.txt里。这就成为robots.txt里的新成员了。</p>
<p>节选一段google robots.txt：</p>
<pre class="line-numbers language-bash" data-language="bash"><code class="language-bash">Sitemap: http://www.gstatic.com/cultur<span class="token punctuation">..</span>.
Sitemap: http://www.google.com/hostedn<span class="token punctuation">..</span>.<span aria-hidden="true" class="line-numbers-rows"><span></span><span></span></span></code></pre>
<p>插一句，考虑到一个网站的网页众多，sitemap人工维护不太靠谱，google提供了工具可以自动生成sitemap。</p>
<p><strong>meta tag</strong><br>其实严格来说这部分内容不属于robots.txt。</p>
<p>robots.txt的初衷是为了让网站管理员管理可以出现在搜索引擎里的网站内容。但是，即使使用 robots.txt 文件让爬虫无法抓取这些内容，搜索引擎也可以通过其他方式找到这些网页并将它添加到索引中。例如，其他网站仍可能链接到该网站。因此，网页网址及其他公开的信息(如指向相关网站的链接中的定位文字或开放式目录管理系统中的标题)有可能会出现在引擎的搜索结果中。如果想彻底对搜索引擎隐身那咋整呢?答案是：元标记，即meta tag。</p>
<p>比如要完全阻止一个网页的内容列在搜索引擎索引中(即使有其他网站链接到此网页)，可使用 noindex 元标记。只要搜索引擎查看该网页，便会看到 noindex 元标记并阻止该网页显示在索引中，这里注意noindex元标记提供的是一种逐页控制对网站的访问的方式。</p>
<p>要防止所有搜索引擎将网站中的网页编入索引，在网页的部分添加：</p>
<pre class="line-numbers language-vbscript-html" data-language="vbscript-html"><code class="language-vbscript-html">&lt;meta name="robots" content="noindex"&gt;<span aria-hidden="true" class="line-numbers-rows"><span></span></span></code></pre>
<p>这里的name取值可以设置为某个搜索引擎的User-agent从而指定屏蔽某一个搜索引擎。</p>
<p>除了noindex外，还有其他元标记，比如说nofollow，禁止爬虫从此页面中跟踪链接。详细信息可以参考Google支持的元标记，这里提一句：noindex和nofollow在HTML 4.01规范里有描述，但是其他tag的在不同引擎支持到什么程度各不相同，还请读者自行查阅各个引擎的说明文档。</p>
<p><strong>Crawl-delay</strong><br>除了控制哪些可以抓哪些不能抓之外，robots.txt还可以用来控制爬虫抓取的速率。如何做到的呢?通过设置爬虫在两次抓取之间等待的秒数。这种操作可以进行缓解服务器压力。</p>
<pre class="line-numbers language-bash" data-language="bash"><code class="language-bash">Crawl-delay:5<span aria-hidden="true" class="line-numbers-rows"><span></span></span></code></pre>
<p>表示本次抓取后下一次抓取前需要等待5秒。</p>
<p>注意：google已经不支持这种方式了，在webmaster tools里提供了一个功能可以更直观的控制抓取速率。</p>
<p>这里插一句题外话，几年前曾经有一段时间robots.txt还支持复杂的参数:Visit-time，只有在visit-time指定的时间段里，爬虫才可以访问;Request-rate: 用来限制URL的读取频率，用于控制不同的时间段采用不同的抓取速率。后来估计支持的人太少，就渐渐的废掉了，目前google和baidu都已经不支持这个规则了，其他小的引擎公司貌似从来都没有支持过。</p>
<h3 id="Robots协议的遵守方式"><a href="#Robots协议的遵守方式" class="headerlink" title="Robots协议的遵守方式"></a>Robots协议的遵守方式</h3><p>网络爬虫：<br>自动或人工识别rotbots.txt，再进行内容爬取<br>约束性:<br>Robots协议是建议但非约束性，网络爬虫可以不遵守，但存在法律风险。</p>
<h3 id="对Robots协议的理解"><a href="#对Robots协议的理解" class="headerlink" title="对Robots协议的理解"></a>对Robots协议的理解</h3><table>
<thead>
<tr>
<th>访问量小:可以遵守<br>访问量较大：建议遵守</th>
<th>非商业且偶尔:建议遵守<br>商业利益:必须遵守</th>
<th>必须遵守</th>
</tr>
</thead>
<tbody><tr>
<td>爬取网页 玩转网页</td>
<td>爬取网站 爬取系列网站</td>
<td>爬取全网</td>
</tr>
<tr>
<td><code>原则</code>：类人行为可以不参考Robots协议。</td>
<td></td>
<td></td>
</tr>
</tbody></table>
<h3 id="防君子不防小人（君子协议）"><a href="#防君子不防小人（君子协议）" class="headerlink" title="防君子不防小人（君子协议）"></a>防君子不防小人（君子协议）</h3><p>Robots协议不是什么技术壁垒，而只是一种互相尊重的协议，好比私家花园的门口挂着“闲人免进”，尊重者绕道而行，不尊重者依然可以推门而入。目前，Robots协议在实际使用中，还存在一些问题。</p>
<h3 id="缓存"><a href="#缓存" class="headerlink" title="缓存"></a>缓存</h3><p>robots.txt本身也是需要抓取的，出于效率考虑，一般爬虫不会每次抓取网站网页前都抓一下robots.txt，加上robots.txt更新不频繁，内容需要解析。通常爬虫的做法是先抓取一次，解析后缓存下来，而且是相当长的时间。假设网站管理员更新了robots.txt，修改了某些规则，但是对爬虫来说并不会立刻生效，只有当爬虫下次抓取robots.txt之后才能看到最新的内容。尴尬的是，爬虫下次抓取robots.txt的时间并不是由网站管理员控制的。当然，有些搜索引擎提供了web 工具可以让网站管理员通知搜索引擎那个url发生了变化，建议重新抓取。注意，此处是建议，即使你通知了搜索引擎，搜索引擎何时抓取仍然是不确定的，只是比完全不通知要好点。至于好多少，那就看搜索引擎的良心和技术能力了。</p>
<h3 id="ignore"><a href="#ignore" class="headerlink" title="ignore"></a>ignore</h3><p>不知是无意还是有意，反正有些爬虫不太遵守或者完全忽略robots.txt，不排除开发人员能力的问题，比如说根本不知道robots.txt。另外，本身robots.txt不是一种强制措施，如果网站有数据需要保密，必需采取技术措施，比如说：用户验证，加密，ip拦截，访问频率控制等。</p>
<p><img src="https://raw.githubusercontent.com/jacksonyoudi/images/main/uPic/nGoP78.jpg"></p>
<h3 id="恶意爬虫"><a href="#恶意爬虫" class="headerlink" title="恶意爬虫"></a>恶意爬虫</h3><p>在互联网世界中，每天都有不计其数的爬虫在日夜不息地爬取数据，其中恶意爬虫的数量甚至高于非恶意爬虫。遵守Robots协议的爬虫才是好爬虫，但是并不是每个爬虫都会主动遵守Robots协议。</p>
<p>恶意爬虫可以带来很多潜在威胁，比如电商网站的商品信息被爬取可能会被竞争对手利用，过多的爬虫还会占用带宽资源、甚至导致网站宕机。</p>
<p>反恶意爬虫是一件漫长而艰巨的任务，如果依靠自身实力难以解决，可以借助岂安科技的业务风险分析平台 WARDEN 来反恶意爬虫，根据自己的需求来定制功能。</p>

                
            </div>
            <hr/>

            

    <div class="reprint" id="reprint-statement">
        
            <div class="reprint__author">
                <span class="reprint-meta" style="font-weight: bold;">
                    <i class="fas fa-user">
                        文章作者:
                    </i>
                </span>
                <span class="reprint-info">
                    <a href="/about" rel="external nofollow noreferrer">Jackson</a>
                </span>
            </div>
            <div class="reprint__type">
                <span class="reprint-meta" style="font-weight: bold;">
                    <i class="fas fa-link">
                        文章链接:
                    </i>
                </span>
                <span class="reprint-info">
                    <a href="https://www.jacksonyoudi.com/2021/07/31/robots-protocol/">https://www.jacksonyoudi.com/2021/07/31/robots-protocol/</a>
                </span>
            </div>
            <div class="reprint__notice">
                <span class="reprint-meta" style="font-weight: bold;">
                    <i class="fas fa-copyright">
                        版权声明:
                    </i>
                </span>
                <span class="reprint-info">
                    本博客所有文章除特別声明外，均采用
                    <a href="https://creativecommons.org/licenses/by/4.0/deed.zh" rel="external nofollow noreferrer" target="_blank">CC BY 4.0</a>
                    许可协议。转载请注明来源
                    <a href="/about" target="_blank">Jackson</a>
                    !
                </span>
            </div>
        
    </div>

    <script async defer>
      document.addEventListener("copy", function (e) {
        let toastHTML = '<span>复制成功，请遵循本文的转载规则</span><button class="btn-flat toast-action" onclick="navToReprintStatement()" style="font-size: smaller">查看</a>';
        M.toast({html: toastHTML})
      });

      function navToReprintStatement() {
        $("html, body").animate({scrollTop: $("#reprint-statement").offset().top - 80}, 800);
      }
    </script>



            <div class="tag_share" style="display: block;">
                <div class="post-meta__tag-list" style="display: inline-block;">
                    
                        <div class="article-tag">
                            
                                <a href="/tags/crawler/">
                                    <span class="chip bg-color">crawler</span>
                                </a>
                            
                                <a href="/tags/%E7%88%AC%E8%99%AB/">
                                    <span class="chip bg-color">爬虫</span>
                                </a>
                            
                                <a href="/tags/robots/">
                                    <span class="chip bg-color">robots</span>
                                </a>
                            
                        </div>
                    
                </div>
                <div class="post_share" style="zoom: 80%; width: fit-content; display: inline-block; float: right; margin: -0.15rem 0;">
                    <link rel="stylesheet" type="text/css" href="/libs/share/css/share.min.css">
<div id="article-share">

    
    <div class="social-share" data-sites="twitter,facebook,google,qq,qzone,wechat,weibo,douban,linkedin" data-wechat-qrcode-helper="<p>微信扫一扫即可分享！</p>"></div>
    <script src="/libs/share/js/social-share.min.js"></script>
    

    

</div>

                </div>
            </div>
            
        </div>
    </div>

    

    

    

    

    

    

    

    

    

<article id="prenext-posts" class="prev-next articles">
    <div class="row article-row">
        
        <div class="article col s12 m6" data-aos="fade-up">
            <div class="article-badge left-badge text-color">
                <i class="fas fa-chevron-left"></i>&nbsp;上一篇</div>
            <div class="card">
                <a href="/2021/08/01/spark-load-to-es/">
                    <div class="card-image">
                        
                        <img src="https://raw.githubusercontent.com/jacksonyoudi/images/main/uPic/H8s52K.jpg" class="responsive-img" alt="spark加载数据到ES">
                        
                        <span class="card-title">spark加载数据到ES</span>
                    </div>
                </a>
                <div class="card-content article-content">
                    <div class="summary block-with-text">
                        
                            spark多种方式将数据写入的elasticsearch，会通过源码介绍scala的隐式转换进行mixin方式实现不侵入实现RDD具有saveToES功能
                        
                    </div>
                    <div class="publish-info">
                        <span class="publish-date">
                            <i class="far fa-clock fa-fw icon-date"></i>2021-08-01
                        </span>
                        <span class="publish-author">
                            
                            <i class="fas fa-bookmark fa-fw icon-category"></i>
                            
                            <a href="/categories/%E5%A4%A7%E6%95%B0%E6%8D%AE/" class="post-category">
                                    大数据
                                </a>
                            
                            
                        </span>
                    </div>
                </div>
                
                <div class="card-action article-tags">
                    
                    <a href="/tags/spark/">
                        <span class="chip bg-color">spark</span>
                    </a>
                    
                    <a href="/tags/es/">
                        <span class="chip bg-color">es</span>
                    </a>
                    
                    <a href="/tags/%E5%A4%A7%E6%95%B0%E6%8D%AE/">
                        <span class="chip bg-color">大数据</span>
                    </a>
                    
                </div>
                
            </div>
        </div>
        
        
        <div class="article col s12 m6" data-aos="fade-up">
            <div class="article-badge right-badge text-color">
                下一篇&nbsp;<i class="fas fa-chevron-right"></i>
            </div>
            <div class="card">
                <a href="/2021/07/31/hello-world/">
                    <div class="card-image">
                        
                        
                        <img src="/medias/featureimages/12.jpg" class="responsive-img" alt="Hello World">
                        
                        <span class="card-title">Hello World</span>
                    </div>
                </a>
                <div class="card-content article-content">
                    <div class="summary block-with-text">
                        
                            
                        
                    </div>
                    <div class="publish-info">
                            <span class="publish-date">
                                <i class="far fa-clock fa-fw icon-date"></i>2021-07-31
                            </span>
                        <span class="publish-author">
                            
                            <i class="fas fa-user fa-fw"></i>
                            Jackson
                            
                        </span>
                    </div>
                </div>
                
            </div>
        </div>
        
    </div>
</article>

</div>


<script>
    $('#articleContent').on('copy', function (e) {
        // IE8 or earlier browser is 'undefined'
        if (typeof window.getSelection === 'undefined') return;

        var selection = window.getSelection();
        // if the selection is short let's not annoy our users.
        if (('' + selection).length < Number.parseInt('120')) {
            return;
        }

        // create a div outside of the visible area and fill it with the selected text.
        var bodyElement = document.getElementsByTagName('body')[0];
        var newdiv = document.createElement('div');
        newdiv.style.position = 'absolute';
        newdiv.style.left = '-99999px';
        bodyElement.appendChild(newdiv);
        newdiv.appendChild(selection.getRangeAt(0).cloneContents());

        // we need a <pre> tag workaround.
        // otherwise the text inside "pre" loses all the line breaks!
        if (selection.getRangeAt(0).commonAncestorContainer.nodeName === 'PRE' || selection.getRangeAt(0).commonAncestorContainer.nodeName === 'CODE') {
            newdiv.innerHTML = "<pre>" + newdiv.innerHTML + "</pre>";
        }

        var url = document.location.href;
        newdiv.innerHTML += '<br />'
            + '来源: JacksonYoudi<br />'
            + '文章作者: Jackson<br />'
            + '文章链接: <a href="' + url + '">' + url + '</a><br />'
            + '本文章著作权归作者所有，任何形式的转载都请注明出处。';

        selection.selectAllChildren(newdiv);
        window.setTimeout(function () {bodyElement.removeChild(newdiv);}, 200);
    });
</script>


<!-- 代码块功能依赖 -->
<script type="text/javascript" src="/libs/codeBlock/codeBlockFuction.js"></script>

<!-- 代码语言 -->

<script type="text/javascript" src="/libs/codeBlock/codeLang.js"></script>


<!-- 代码块复制 -->

<script type="text/javascript" src="/libs/codeBlock/codeCopy.js"></script>


<!-- 代码块收缩 -->

<script type="text/javascript" src="/libs/codeBlock/codeShrink.js"></script>


    </div>
    <div id="toc-aside" class="expanded col l3 hide-on-med-and-down">
        <div class="toc-widget card" style="background-color: white;">
            <div class="toc-title"><i class="far fa-list-alt"></i>&nbsp;&nbsp;目录</div>
            <div id="toc-content"></div>
        </div>
    </div>
</div>

<!-- TOC 悬浮按钮. -->

<div id="floating-toc-btn" class="hide-on-med-and-down">
    <a class="btn-floating btn-large bg-color">
        <i class="fas fa-list-ul"></i>
    </a>
</div>


<script src="/libs/tocbot/tocbot.min.js"></script>
<script>
    $(function () {
        tocbot.init({
            tocSelector: '#toc-content',
            contentSelector: '#articleContent',
            headingsOffset: -($(window).height() * 0.4 - 45),
            collapseDepth: Number('0'),
            headingSelector: 'h2, h3, h4'
        });

        // modify the toc link href to support Chinese.
        let i = 0;
        let tocHeading = 'toc-heading-';
        $('#toc-content a').each(function () {
            $(this).attr('href', '#' + tocHeading + (++i));
        });

        // modify the heading title id to support Chinese.
        i = 0;
        $('#articleContent').children('h2, h3, h4').each(function () {
            $(this).attr('id', tocHeading + (++i));
        });

        // Set scroll toc fixed.
        let tocHeight = parseInt($(window).height() * 0.4 - 64);
        let $tocWidget = $('.toc-widget');
        $(window).scroll(function () {
            let scroll = $(window).scrollTop();
            /* add post toc fixed. */
            if (scroll > tocHeight) {
                $tocWidget.addClass('toc-fixed');
            } else {
                $tocWidget.removeClass('toc-fixed');
            }
        });

        
        /* 修复文章卡片 div 的宽度. */
        let fixPostCardWidth = function (srcId, targetId) {
            let srcDiv = $('#' + srcId);
            if (srcDiv.length === 0) {
                return;
            }

            let w = srcDiv.width();
            if (w >= 450) {
                w = w + 21;
            } else if (w >= 350 && w < 450) {
                w = w + 18;
            } else if (w >= 300 && w < 350) {
                w = w + 16;
            } else {
                w = w + 14;
            }
            $('#' + targetId).width(w);
        };

        // 切换TOC目录展开收缩的相关操作.
        const expandedClass = 'expanded';
        let $tocAside = $('#toc-aside');
        let $mainContent = $('#main-content');
        $('#floating-toc-btn .btn-floating').click(function () {
            if ($tocAside.hasClass(expandedClass)) {
                $tocAside.removeClass(expandedClass).hide();
                $mainContent.removeClass('l9');
            } else {
                $tocAside.addClass(expandedClass).show();
                $mainContent.addClass('l9');
            }
            fixPostCardWidth('artDetail', 'prenext-posts');
        });
        
    });
</script>

    

</main>




    <footer class="page-footer bg-color">
    

    <div class="container row center-align"
         style="margin-bottom: 0px !important;">
        <div class="col s12 m8 l8 copy-right">
            Copyright&nbsp;&copy;
            
                <span id="year">2019-2021</span>
            
            <span id="year">2019</span>
            <a href="/about" target="_blank">Jackson</a>
            |&nbsp;Powered by&nbsp;<a href="https://hexo.io/" target="_blank">Hexo</a>
            |&nbsp;Theme&nbsp;<a href="https://github.com/blinkfox/hexo-theme-matery" target="_blank">Matery</a>
            <br>
            
            
            
                
            
            
                <span id="busuanzi_container_site_pv">
                &nbsp;|&nbsp;<i class="far fa-eye"></i>&nbsp;总访问量:&nbsp;
                    <span id="busuanzi_value_site_pv" class="white-color"></span>
            </span>
            
            
                <span id="busuanzi_container_site_uv">
                &nbsp;|&nbsp;<i class="fas fa-users"></i>&nbsp;总访问人数:&nbsp;
                    <span id="busuanzi_value_site_uv" class="white-color"></span>
            </span>
            
            <br>

            <!-- 运行天数提醒. -->
            
            <br>
            
        </div>
        <div class="col s12 m4 l4 social-link social-statis">
    <a href="https://github.com/jacksonyoudi" class="tooltipped" target="_blank" data-tooltip="访问我的GitHub" data-position="top" data-delay="50">
        <i class="fab fa-github"></i>
    </a>



    <a href="mailto:liangchangyoujackson@gmail.com" class="tooltipped" target="_blank" data-tooltip="邮件联系我" data-position="top" data-delay="50">
        <i class="fas fa-envelope-open"></i>
    </a>













</div>
    </div>
</footer>

<div class="progress-bar"></div>


    <!-- 搜索遮罩框 -->
<div id="searchModal" class="modal">
    <div class="modal-content">
        <div class="search-header">
            <span class="title"><i class="fas fa-search"></i>&nbsp;&nbsp;搜索</span>
            <input type="search" id="searchInput" name="s" placeholder="请输入搜索的关键字"
                   class="search-input">
        </div>
        <div id="searchResult"></div>
    </div>
</div>

<script type="text/javascript">
$(function () {
    var searchFunc = function (path, search_id, content_id) {
        'use strict';
        $.ajax({
            url: path,
            dataType: "xml",
            success: function (xmlResponse) {
                // get the contents from search data
                var datas = $("entry", xmlResponse).map(function () {
                    return {
                        title: $("title", this).text(),
                        content: $("content", this).text(),
                        url: $("url", this).text()
                    };
                }).get();
                var $input = document.getElementById(search_id);
                var $resultContent = document.getElementById(content_id);
                $input.addEventListener('input', function () {
                    var str = '<ul class=\"search-result-list\">';
                    var keywords = this.value.trim().toLowerCase().split(/[\s\-]+/);
                    $resultContent.innerHTML = "";
                    if (this.value.trim().length <= 0) {
                        return;
                    }
                    // perform local searching
                    datas.forEach(function (data) {
                        var isMatch = true;
                        var data_title = data.title.trim().toLowerCase();
                        var data_content = data.content.trim().replace(/<[^>]+>/g, "").toLowerCase();
                        var data_url = data.url;
                        data_url = data_url.indexOf('/') === 0 ? data.url : '/' + data_url;
                        var index_title = -1;
                        var index_content = -1;
                        var first_occur = -1;
                        // only match artiles with not empty titles and contents
                        if (data_title !== '' && data_content !== '') {
                            keywords.forEach(function (keyword, i) {
                                index_title = data_title.indexOf(keyword);
                                index_content = data_content.indexOf(keyword);
                                if (index_title < 0 && index_content < 0) {
                                    isMatch = false;
                                } else {
                                    if (index_content < 0) {
                                        index_content = 0;
                                    }
                                    if (i === 0) {
                                        first_occur = index_content;
                                    }
                                }
                            });
                        }
                        // show search results
                        if (isMatch) {
                            str += "<li><a href='" + data_url + "' class='search-result-title'>" + data_title + "</a>";
                            var content = data.content.trim().replace(/<[^>]+>/g, "");
                            if (first_occur >= 0) {
                                // cut out 100 characters
                                var start = first_occur - 20;
                                var end = first_occur + 80;
                                if (start < 0) {
                                    start = 0;
                                }
                                if (start === 0) {
                                    end = 100;
                                }
                                if (end > content.length) {
                                    end = content.length;
                                }
                                var match_content = content.substr(start, end);
                                // highlight all keywords
                                keywords.forEach(function (keyword) {
                                    var regS = new RegExp(keyword, "gi");
                                    match_content = match_content.replace(regS, "<em class=\"search-keyword\">" + keyword + "</em>");
                                });

                                str += "<p class=\"search-result\">" + match_content + "...</p>"
                            }
                            str += "</li>";
                        }
                    });
                    str += "</ul>";
                    $resultContent.innerHTML = str;
                });
            }
        });
    };

    searchFunc('/search.xml', 'searchInput', 'searchResult');
});
</script>

    <!-- 回到顶部按钮 -->
<div id="backTop" class="top-scroll">
    <a class="btn-floating btn-large waves-effect waves-light" href="#!">
        <i class="fas fa-arrow-up"></i>
    </a>
</div>


    <script src="/libs/materialize/materialize.min.js"></script>
    <script src="/libs/masonry/masonry.pkgd.min.js"></script>
    <script src="/libs/aos/aos.js"></script>
    <script src="/libs/scrollprogress/scrollProgress.min.js"></script>
    <script src="/libs/lightGallery/js/lightgallery-all.min.js"></script>
    <script src="/js/matery.js"></script>

    <!-- Baidu Analytics -->

    <!-- Baidu Push -->

<script>
    (function () {
        var bp = document.createElement('script');
        var curProtocol = window.location.protocol.split(':')[0];
        if (curProtocol === 'https') {
            bp.src = 'https://zz.bdstatic.com/linksubmit/push.js';
        } else {
            bp.src = 'http://push.zhanzhang.baidu.com/push.js';
        }
        var s = document.getElementsByTagName("script")[0];
        s.parentNode.insertBefore(bp, s);
    })();
</script>

    
    <script src="/libs/others/clicklove.js" async="async"></script>
    
    
    <script async src="/libs/others/busuanzi.pure.mini.js"></script>
    

    

    

    <!--腾讯兔小巢-->
    
    

    

    

    
    <script src="/libs/instantpage/instantpage.js" type="module"></script>
    

</body>

</html>
